Gender bias in speakers and career position

Data

EcoEncontros Seminar talks

Talks from EcoEncontros Seminar series at the Graduate Program of Ecology in the University of SĂŁo Paulo (PPGE-USP), Brazil

See file metadata.txt, in folder data for more description and detail of the dataset.

data <- read.table("data/presentations_PPGE_2008-2019.csv", sep=",",
                   header=T, as.is=T)
data$date <- dmy(data$date)
data$year <- year(data$date) 
#skimr::skim(data)

Excluding special events as round tables and discussions not related to a project or study presented by someone.

IDs <- c(154, 250, 211, 289)
data <- data %>% filter(!id %in% IDs)

For this specific analysis, excluding speakers that are not in academia (“others”), and keeping undergraduate students, MD and PhD in the group student. postdoc, professor or researcher*.

*Researchers are included in the professor categorical position (column position_cat) because all of them come from research institutions.

data <- data %>% filter(position_cat != "others")
data$position_cat <- fct_relevel(data$position_cat, "student", 
                                 "postdoc","professor")

Excluding seminars with more than one speaker

events <- data %>% count(id) %>% filter(n>1)
data2 <- data %>% filter(!id %in% events$id,
                        !is.na(audience_n))
dim(data)
## [1] 330  30

Data description

Audience by gender and academic position

ggplot(data, aes(x=position_cat, y=audience_n, fill=gender)) +
  scale_fill_manual(values = c("#b2abd2", "#fdb863"))+
  geom_boxplot()

  #geom_violin(position = position_dodge(0.8)) +
  #geom_jitter(position=position_jitterdodge(0.2),shape=21)
library(ggbeeswarm)
# outra opção
ggplot(data, aes(x=position_cat, y=audience_n, fill=gender)) +
  scale_fill_manual(values = c("#b2abd2", "#fdb863"))+
  scale_color_manual(values = c("#b2abd2", "#fdb863"))+
  geom_violin(col="black") +
  geom_quasirandom(dodge.width = 0.9, shape=21)+
  stat_summary(fun.y=median, aes(ymin=..y.., ymax=..y..),geom='errorbar', 
               width=0.8, size=0.8, position = position_dodge(width = 0.9))+
  xlab("") + ylab("Audience (N)")

Variation in time

ggplot(data, aes(x=date, y=audience_n, fill=gender)) +
  facet_wrap(~position_cat, ncol=1)+
  scale_fill_manual(values = c("#b2abd2", "#fdb863"))+
  scale_color_manual(values = c("#b2abd2", "#fdb863"))+
  geom_quasirandom(dodge.width = 0.9, shape=21)+
  geom_smooth()+
  xlab("") + ylab("Audience (N)")

Looking for possible biases for speakers from inside and outside PPGE.

data$ppge <- ifelse(data$origin == "IB", "inside", "outside")
table(data$gender,data$ppge)
##    
##     inside outside
##   F     88      53
##   M     91      98
ggplot(data, aes(x=ppge, y=audience_n, fill=gender)) +
  scale_fill_manual(values = c("#b2abd2", "#fdb863"))+
  scale_color_manual(values = c("#b2abd2", "#fdb863"))+
  geom_violin(col="black") +
  geom_quasirandom(dodge.width = 0.9, shape=21)+
  stat_summary(fun.y=median, aes(ymin=..y.., ymax=..y..),geom='errorbar', 
               width=0.8, size=0.8, position = position_dodge(width = 0.9))+
  xlab("PPGE") + ylab("Audience (N)")

Looking for possible biases for speakers from Brazil and abroad.

data$brazilian <- ifelse(data$country == "Brasil", "yes", "no")
table(data$gender,data$brazilian)
##    
##      no yes
##   F  25 116
##   M  51 138
ggplot(data, aes(x=brazilian, y=audience_n, fill=gender)) +
  scale_fill_manual(values = c("#b2abd2", "#fdb863"))+
  scale_color_manual(values = c("#b2abd2", "#fdb863"))+
  geom_violin(col="black") +
  geom_quasirandom(dodge.width = 0.9, shape=21)+
  stat_summary(fun.y=median, aes(ymin=..y.., ymax=..y..),geom='errorbar', 
               width=0.8, size=0.8, position = position_dodge(width = 0.9))+
  xlab("Brazilian") + ylab("Audience (N)")

Modeling

Negative binomial

mg0 <- glm.nb(audience_n~ 1, data=data)
mg1 <- glm.nb(audience_n~ gender, data=data)
mg2 <- glm.nb(audience_n~ position_cat, data=data)
mg3 <- glm.nb(audience_n~ year, data=data)

mg4 <- glm.nb(audience_n~ gender + position_cat, data=data)
mg5 <- glm.nb(audience_n~ gender + year, data=data)
mg6 <- glm.nb(audience_n~ year + position_cat, data=data)

mg7 <- glm.nb(audience_n~ gender*position_cat, data=data)
mg8 <- glm.nb(audience_n~ gender*year, data=data)
mg9 <- glm.nb(audience_n~ year*position_cat, data=data)

mg10 <- glm.nb(audience_n~ gender*position_cat*year, data=data)

AICtab(mg2, mg0,mg1, mg3, mg4,mg5,mg6,mg7,mg8,mg9,mg10)
##      dAIC df
## mg7   0.0 7 
## mg4   1.7 5 
## mg6   2.2 5 
## mg2   4.1 4 
## mg10  4.4 13
## mg9   5.2 7 
## mg5  30.3 4 
## mg1  30.9 3 
## mg8  31.9 5 
## mg3  42.5 3 
## mg0  42.7 2

Residual diagnostic

hnp::hnp(mg7)
## Negative binomial model (using MASS package)

plot(simulateResiduals(mg7))

hnp::hnp(mg4)
## Negative binomial model (using MASS package)

plot(simulateResiduals(mg4))

Models result

summary(mg7)
## 
## Call:
## glm.nb(formula = audience_n ~ gender * position_cat, data = data, 
##     init.theta = 6.535878002, link = log)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.9224  -0.7849  -0.1143   0.5305   3.6658  
## 
## Coefficients:
##                               Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                    2.93339    0.04760  61.621   <2e-16 ***
## genderM                        0.05745    0.06888   0.834    0.404    
## position_catpostdoc            0.01944    0.10234   0.190    0.849    
## position_catprofessor          0.13000    0.10127   1.284    0.199    
## genderM:position_catpostdoc   -0.06749    0.13922  -0.485    0.628    
## genderM:position_catprofessor  0.26468    0.12339   2.145    0.032 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for Negative Binomial(6.5359) family taken to be 1)
## 
##     Null deviance: 392.02  on 329  degrees of freedom
## Residual deviance: 334.86  on 324  degrees of freedom
## AIC: 2401.8
## 
## Number of Fisher Scoring iterations: 1
## 
## 
##               Theta:  6.536 
##           Std. Err.:  0.657 
## 
##  2 x log-likelihood:  -2387.781
performance::r2(mg7)
## # R2 for Generalized Linear Regression
##   Nagelkerke's R2: 0.229
myg7 <- ggpredict(mg7, terms=c("position_cat","gender"))
plot(myg7) +
  scale_color_manual(values = c("#b2abd2", "#fdb863"))

summary(mg4)
## 
## Call:
## glm.nb(formula = audience_n ~ gender + position_cat, data = data, 
##     init.theta = 6.390867873, link = log)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.8952  -0.7755  -0.1604   0.4810   3.7995  
## 
## Coefficients:
##                       Estimate Std. Error z value Pr(>|z|)    
## (Intercept)            2.90807    0.04276  68.010  < 2e-16 ***
## genderM                0.11084    0.05196   2.133   0.0329 *  
## position_catpostdoc   -0.02214    0.06993  -0.317   0.7516    
## position_catprofessor  0.31862    0.05776   5.517 3.45e-08 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for Negative Binomial(6.3909) family taken to be 1)
## 
##     Null deviance: 385.44  on 329  degrees of freedom
## Residual deviance: 334.94  on 326  degrees of freedom
## AIC: 2403.5
## 
## Number of Fisher Scoring iterations: 1
## 
## 
##               Theta:  6.391 
##           Std. Err.:  0.638 
## 
##  2 x log-likelihood:  -2393.505
performance::r2(mg4)
## # R2 for Generalized Linear Regression
##   Nagelkerke's R2: 0.206
myg4 <- ggpredict(mg4, terms=c("position_cat","gender"))
plot(myg4) +
  scale_color_manual(values = c("#b2abd2", "#fdb863"))

Only professors - productivity metrics

Investigating if differences in productivity between male and female professors and researches are related to the audience.

Measured productivity publication metrics from Google Scholar for professors and researchers.

Creating productivity index using PCA 1st axis from metrics.

PCA productivity metrics

dp <- data %>% filter(!is.na(data$total_citation_n),
                      !is.na(data$nature_index_count))
table(dp$gender)
## 
##  F  M 
## 20 69

Productivity publication metrics

pca1 <- PCA(dp[, c(21:28)], graph=F)
p1 <- fviz_pca_biplot(pca1, col.ind = dp$gender, addEllipses=TRUE,
                      col.ind.sub="none",  geom="point",
                      repel = TRUE) +
  scale_color_manual(name="gender",values = c("#b2abd2", "#fdb863"))+
  scale_fill_manual(name="gender",values = c("#b2abd2", "#fdb863"))
p1

Extracting PCA 2 first axes

dp$pc1 <- pca1$ind$coord[,1]
dp$pc2 <- pca1$ind$coord[,2]

Modeling

m0 <- glm.nb(audience_n ~ 1, data=dp)
m1 <- glm.nb(audience_n ~ gender, data=dp)
m2 <- glm.nb(audience_n ~ pc1 + pc2, data=dp)

m3 <- glm.nb(audience_n ~ gender + pc1 + pc2, data=dp)
m4 <- glm.nb(audience_n ~ gender*pc1 + gender*pc2, data=dp)

AICtab(m0,m1,m2,m3,m4, base=T, weights=T)
##    AIC   dAIC  df weight
## m3 709.3   0.0 5  0.556 
## m4 711.5   2.3 7  0.179 
## m2 711.9   2.7 4  0.148 
## m1 712.6   3.4 3  0.103 
## m0 716.6   7.3 2  0.014

Residual diagnostic

Best model

hnp(m3)
## Negative binomial model (using MASS package)

plot(simulateResiduals(m3))

## Model results

summary(m3)
## 
## Call:
## glm.nb(formula = audience_n ~ gender + pc1 + pc2, data = dp, 
##     init.theta = 5.308640158, link = log)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.5620  -0.7413  -0.2038   0.4620   3.3773  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  3.11234    0.10922  28.497  < 2e-16 ***
## genderM      0.27429    0.12375   2.217  0.02666 *  
## pc1          0.07082    0.02460   2.879  0.00399 ** 
## pc2         -0.01647    0.03867  -0.426  0.67009    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for Negative Binomial(5.3086) family taken to be 1)
## 
##     Null deviance: 105.618  on 88  degrees of freedom
## Residual deviance:  91.338  on 85  degrees of freedom
## AIC: 709.27
## 
## Number of Fisher Scoring iterations: 1
## 
## 
##               Theta:  5.309 
##           Std. Err.:  0.935 
## 
##  2 x log-likelihood:  -699.267
performance::r2(mg3)
## # R2 for Generalized Linear Regression
##   Nagelkerke's R2: 0.010
my3 <- ggpredict(m3, terms=c("pc1","gender"))
plot(my3) +
  scale_color_manual(values = c("#b2abd2", "#fdb863"))+
  scale_fill_manual(values = c("#b2abd2", "#fdb863"))+
   theme_cowplot() + ggtitle("") +
  ylab("Audience (N)") + xlab("Productivity (PC1 axis)")

plot(my3) +
  scale_color_manual(values = c("#b2abd2", "#fdb863"))+
  scale_fill_manual(values = c("#b2abd2", "#fdb863"))+
   theme_cowplot() + ggtitle("") +
  ylab("Audience (N)") + xlab("Productivity (PC1 axis)")

my3 <- ggpredict(m3, terms=c("pc2","gender"))
plot(my3) +
  scale_color_manual(values = c("#b2abd2", "#fdb863"))+
  scale_fill_manual(values = c("#b2abd2", "#fdb863"))+
   theme_cowplot()